# Data manipulation and visualization
library(tidyverse)
library(here)
library(janitor)
library(patchwork)
#library(grid)
#library(gridExtra)
#library(ghibli)
#library(RColorBrewer)
#library(gt)
#library(gtExtras)
# Data analysis/statistics
library(car) #ANOVA
library(ggeffects)
library(effects) #dependency for ggeffects
library(ggResidpanel) #residual panel
library(GGally)
library(arm) #discrete.histogram() function
library(MASS) #required for arm package
dispersion <- function (model){
sum(residuals(model, type = "pearson")^2)/(length(model$y)-length(model$coefficients))
}
The variables with are interested in are: - turf algae (column ‘ta’) - hard coral (column ‘hard_coral’) - sand (column ‘sand’) - crustose coralline algae (column ‘cca’)
fish <- read_csv(here("HW5","data","CRCP_Reef_Fish_Surveys_kole.csv"))
fish1 <- fish[,c("ta","hard_coral", "sand", "cca")]
ggpairs(fish1)
For each of the four predictors, make single-predictor models where the response variable is kole counts. Consider which of the predictors could be transformed to reduce skew along the x-axis. For educational purposes, start by using a poisson distribution for the counts.
#Based on the frequency distribution of the turf algae (ta) data, the predictor does not seem to be suited for the Poisson distribution.
par(mfrow = c(1,2))
discrete.histogram(fish$count)
discrete.histogram(fish$ta)
ta_lm <- lm(count ~ ta, data = fish)
summary(ta_lm)
##
## Call:
## lm(formula = count ~ ta, data = fish)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28.110 -11.896 -6.206 4.173 111.395
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.11012 3.22407 9.029 < 2e-16 ***
## ta -0.26263 0.05234 -5.018 8.85e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20.31 on 308 degrees of freedom
## Multiple R-squared: 0.07557, Adjusted R-squared: 0.07257
## F-statistic: 25.18 on 1 and 308 DF, p-value: 8.854e-07
resid_panel(ta_lm, plots = c("resid", "qq", "lev", "hist"))
ta_glm <- glm(count ~ ta, family= poisson, data = fish)
summary(ta_glm)
##
## Call:
## glm(formula = count ~ ta, family = poisson, data = fish)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.5784202 0.0352931 101.4 <2e-16 ***
## ta -0.0177069 0.0006581 -26.9 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 7573.1 on 309 degrees of freedom
## Residual deviance: 6862.4 on 308 degrees of freedom
## AIC: 7783.4
##
## Number of Fisher Scoring iterations: 6
resid_panel(ta_glm, plots = c("resid", "qq", "lev", "hist"))
resid_xpanel(ta_glm)
# lecture notes suggest using plot(log.y = TRUE) but I get an error.
ggeff <- ggeffect(ta_glm) %>% plot(log_y = TRUE)
ggeff
Anova(ta_glm)
## Analysis of Deviance Table (Type II tests)
##
## Response: count
## LR Chisq Df Pr(>Chisq)
## ta 710.68 1 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
discrete.histogram(fish$ta, prob.col = 'orange', xlim = c(0, 100), ylim = c(0,0.18))
lines(0:25, dpois(0:25, lambda = mean(fish$count)), col = 'blue',
lwd = 4, type = 'b')
legend(12, 0.15, lwd = 4, col = 'blue', lty = 1, legend = "Poisson
prediction")
Overdispersion: Poisson and Binomial distributions
makes assumptions on the variability of the data which is used for the
calculations of likelihood. Larger variability will result in smaller
Confidence Intervals (CI) and p-values, which is inaccurate.
With a dispersion of 30.10 (>>>1), the data is over dispersed
and it will result in smaller CI and p-values than it should.
dispersion(ta_glm)
## [1] 30.10215
#Based on the frequency distribution of the hard coral data, the predictor seem to be suited for the Poisson distribution.
par(mfrow = c(1,2))
discrete.histogram(fish$count)
discrete.histogram(fish$hard_coral)
coral_lm <- lm(count ~ hard_coral, data = fish)
summary(coral_lm)
##
## Call:
## lm(formula = count ~ hard_coral, data = fish)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33.440 -9.619 -6.177 3.039 111.774
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.31592 1.69524 3.136 0.00188 **
## hard_coral 0.43034 0.06312 6.818 4.88e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.69 on 308 degrees of freedom
## Multiple R-squared: 0.1311, Adjusted R-squared: 0.1283
## F-statistic: 46.49 on 1 and 308 DF, p-value: 4.876e-11
resid_panel(coral_lm, plots = c("resid", "qq", "lev", "hist"))
coral_glm <- glm(count ~ hard_coral, family= poisson, data = fish)
summary(coral_glm)
##
## Call:
## glm(formula = count ~ hard_coral, family = poisson, data = fish)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.0379324 0.0257021 79.29 <2e-16 ***
## hard_coral 0.0243714 0.0006953 35.05 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 7573.1 on 309 degrees of freedom
## Residual deviance: 6481.7 on 308 degrees of freedom
## AIC: 7402.7
##
## Number of Fisher Scoring iterations: 6
resid_panel(coral_glm, plots = c("resid", "qq", "lev", "hist"))
resid_xpanel(coral_glm)
ggeff <- ggeffect(coral_glm) %>% plot(log_y = TRUE)
ggeff
Anova(coral_glm)
## Analysis of Deviance Table (Type II tests)
##
## Response: count
## LR Chisq Df Pr(>Chisq)
## hard_coral 1091.4 1 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
discrete.histogram(fish$hard_coral, prob.col = 'orange', xlim = c(0, 75))
lines(0:25, dpois(0:25, lambda = mean(fish$count)), col = 'blue',
lwd = 4, type = 'b')
legend(12, 0.20, lwd = 4, col = 'blue', lty = 1, legend = "Poisson
prediction")
dispersion(coral_glm)
## [1] 28.13073
With a dispersion of 28.13 (>>>1), the data is over dispersed and it will result in smaller CI and p-values than it should.
#Based on the frequency distribution of the hard coral data, the predictor seem to be suited for the Poisson distribution.
par(mfrow = c(1,2))
discrete.histogram(fish$count)
discrete.histogram(fish$sand)
sand_glm <- glm(count ~ sand, family= poisson, data = fish)
summary(sand_glm)
##
## Call:
## glm(formula = count ~ sand, family = poisson, data = fish)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.946727 0.018964 155.39 <2e-16 ***
## sand -0.035770 0.001685 -21.22 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 7573.1 on 309 degrees of freedom
## Residual deviance: 6998.7 on 308 degrees of freedom
## AIC: 7919.6
##
## Number of Fisher Scoring iterations: 6
resid_panel(sand_glm, plots = c("resid", "qq", "lev", "hist"))
resid_xpanel(sand_glm)
# lecture notes suggest using plot(log.y = TRUE) but I get an error.
ggeff <- ggeffect(sand_glm) %>% plot(log_y = TRUE)
ggeff
Anova(sand_glm)
## Analysis of Deviance Table (Type II tests)
##
## Response: count
## LR Chisq Df Pr(>Chisq)
## sand 574.42 1 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
discrete.histogram(fish$sand, prob.col = 'orange', xlim = c(0, 75))
lines(0:25, dpois(0:25, lambda = mean(fish$count)), col = 'blue',
lwd = 4, type = 'b')
legend(12, 0.20, lwd = 4, col = 'blue', lty = 1, legend = "Poisson
prediction")
dispersion(sand_glm)
## [1] 30.97697
With a dispersion of 30.98 (>>>1), the data is over dispersed and it will result in smaller CI and p-values than it should.
#Based on the frequency distribution of the hard coral data, the predictor seem to be suited for the Poisson distribution.
par(mfrow = c(1,2))
discrete.histogram(fish$count)
discrete.histogram(fish$cca)
cca_glm <- glm(count ~ cca, family= poisson, data = fish)
summary(cca_glm)
##
## Call:
## glm(formula = count ~ cca, family = poisson, data = fish)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.270516 0.021449 105.85 <2e-16 ***
## cca 0.034430 0.001153 29.85 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 7573.1 on 309 degrees of freedom
## Residual deviance: 6846.7 on 308 degrees of freedom
## AIC: 7767.6
##
## Number of Fisher Scoring iterations: 6
resid_panel(cca_glm, plots = c("resid", "qq", "lev", "hist"))
resid_xpanel(cca_glm)
ggeff <- ggeffect(cca_glm) %>% plot(log_y = TRUE)
ggeff
Anova(cca_glm)
## Analysis of Deviance Table (Type II tests)
##
## Response: count
## LR Chisq Df Pr(>Chisq)
## cca 726.46 1 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
discrete.histogram(fish$cca, prob.col = 'orange', xlim = c(0, 75))
lines(0:25, dpois(0:25, lambda = mean(fish$count)), col = 'blue',
lwd = 4, type = 'b')
legend(12, 0.20, lwd = 4, col = 'blue', lty = 1, legend = "Poisson
prediction")
dispersion(cca_glm)
## [1] 30.8293
With a dispersion of 30.83 (>>>1), the data is over dispersed and it will result in smaller CI and p-values than it should.
To deal with over-dispersion, we can use a negative binomial distribution or use random effects.
#negative binomial distribution
ta_nb <- glm.nb(count ~ ta, data = fish)
summary(ta_nb)
##
## Call:
## glm.nb(formula = count ~ ta, data = fish, init.theta = 0.3750484251,
## link = log)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.64697 0.26206 13.92 < 2e-16 ***
## ta -0.01900 0.00427 -4.45 8.57e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(0.375) family taken to be 1)
##
## Null deviance: 365.50 on 309 degrees of freedom
## Residual deviance: 346.15 on 308 degrees of freedom
## AIC: 2085.5
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 0.3750
## Std. Err.: 0.0325
##
## 2 x log-likelihood: -2079.4990
# lecture notes suggest using plot(log.y = TRUE) but I get an error.
ggeff <- ggeffect(ta_nb) %>% plot(log_y=TRUE)
ggeff
Anova(ta_nb)
## Analysis of Deviance Table (Type II tests)
##
## Response: count
## LR Chisq Df Pr(>Chisq)
## ta 19.342 1 1.093e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Overdispersion: With a dispersion of 0.98, the overdispersion is fixed.
dispersion(ta_nb)
## [1] 0.9872545
#negative binomial distribution
hc_nb <- glm.nb(count ~ hard_coral, data = fish)
summary(hc_nb)
##
## Call:
## glm.nb(formula = count ~ hard_coral, data = fish, init.theta = 0.3931303705,
## link = log)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.858032 0.140369 13.237 < 2e-16 ***
## hard_coral 0.031720 0.005176 6.129 8.86e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(0.3931) family taken to be 1)
##
## Null deviance: 379.65 on 309 degrees of freedom
## Residual deviance: 346.22 on 308 degrees of freedom
## AIC: 2072.6
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 0.3931
## Std. Err.: 0.0344
##
## 2 x log-likelihood: -2066.6070
# lecture notes suggest using plot(log.y = TRUE) but I get an error.
ggeff <- ggeffect(hc_nb) %>% plot(log_y=TRUE)
ggeff
Anova(hc_nb)
## Analysis of Deviance Table (Type II tests)
##
## Response: count
## LR Chisq Df Pr(>Chisq)
## hard_coral 33.437 1 7.363e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Overdispersion: With a dispersion of 1.08, the overdispersion is fixed.
dispersion(hc_nb)
## [1] 1.075455
#negative binomial distribution
sand_nb <- glm.nb(count ~ sand, data = fish)
summary(sand_nb)
##
## Call:
## glm.nb(formula = count ~ sand, data = fish, init.theta = 0.3729581071,
## link = log)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.941783 0.124216 23.68 < 2e-16 ***
## sand -0.035008 0.007544 -4.64 3.48e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(0.373) family taken to be 1)
##
## Null deviance: 363.85 on 309 degrees of freedom
## Residual deviance: 345.96 on 308 degrees of freedom
## AIC: 2086.9
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 0.3730
## Std. Err.: 0.0323
##
## 2 x log-likelihood: -2080.8530
# lecture notes suggest using plot(log.y = TRUE) but I get an error.
ggeff <- ggeffect(sand_nb) %>% plot(log_y=TRUE)
ggeff
Anova(sand_nb)
## Analysis of Deviance Table (Type II tests)
##
## Response: count
## LR Chisq Df Pr(>Chisq)
## sand 17.888 1 2.343e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Overdispersion: With a dispersion of 0.95, the overdispersion is fixed.
dispersion(sand_nb)
## [1] 0.9493442
#negative binomial distribution
cca_nb <- glm.nb(count ~ cca, data = fish)
summary(cca_nb)
##
## Call:
## glm.nb(formula = count ~ cca, data = fish, init.theta = 0.3839858353,
## link = log)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.999507 0.126584 15.796 < 2e-16 ***
## cca 0.060098 0.009659 6.222 4.91e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(0.384) family taken to be 1)
##
## Null deviance: 372.51 on 309 degrees of freedom
## Residual deviance: 345.99 on 308 degrees of freedom
## AIC: 2078.8
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 0.3840
## Std. Err.: 0.0334
##
## 2 x log-likelihood: -2072.8340
# lecture notes suggest using plot(log.y = TRUE) but I get an error.
ggeff <- ggeffect(cca_nb) %>% plot(log_y=TRUE)
ggeff
Anova(cca_nb)
## Analysis of Deviance Table (Type II tests)
##
## Response: count
## LR Chisq Df Pr(>Chisq)
## cca 26.529 1 2.597e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Overdispersion: With a dispersion of 0.99, the overdispersion is fixed.
dispersion(ta_nb)
## [1] 0.9872545
The dispersion values became closer to one using the negative binomial. This is because in this model, the variance is proportional to the mean and data points become corrected by a factor of “Phi”.
For this model, I tried a regular glm() with Poisson distribution and the dispersion was about 27. I decided to do negative binomial model to minimize the dispersion.
full_glm <- glm.nb(count ~ ta + hard_coral + sand + cca, data = fish)
summary(full_glm)
##
## Call:
## glm.nb(formula = count ~ ta + hard_coral + sand + cca, data = fish,
## init.theta = 0.4547766048, link = log)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.85507 1.66061 -1.719 0.08556 .
## ta 0.04659 0.01703 2.736 0.00622 **
## hard_coral 0.07439 0.01716 4.336 1.45e-05 ***
## sand 0.01835 0.01825 1.006 0.31453
## cca 0.09260 0.02028 4.566 4.96e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(0.4548) family taken to be 1)
##
## Null deviance: 426.74 on 309 degrees of freedom
## Residual deviance: 345.03 on 305 degrees of freedom
## AIC: 2038.2
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 0.4548
## Std. Err.: 0.0410
##
## 2 x log-likelihood: -2026.2040
# lecture notes suggest using plot(log.y = TRUE) but I get an error.
ggeff <- ggeffect(full_glm) %>% plot(log_y=TRUE)
ggeff
## $ta
##
## $hard_coral
##
## $sand
##
## $cca
Anova(full_glm)
## Analysis of Deviance Table (Type II tests)
##
## Response: count
## LR Chisq Df Pr(>Chisq)
## ta 6.2887 1 0.0121507 *
## hard_coral 14.3924 1 0.0001484 ***
## sand 1.0950 1 0.2953666
## cca 16.4564 1 4.978e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Overdispersion: With a dispersion of 0.99, the overdispersion is fixed.
full_glm <- glm.nb(count ~ ta + hard_coral + sand + cca, data = fish)
dispersion(full_glm)
## [1] 1.512709
The resid_panel function did not work with the glm.nb().
Overall, it looks like corals and CCA are more associated with higher counts of Kole. We can possibly perform a linear regression and calculate the R-squared and p-values to estimate the significance of the correlation.